林嶔 (Lin, Chin)
Lesson 6
– Please download this week's example data from here
dat = read.csv("Example data.csv", header = TRUE)
head(dat)
## eGFR Disease Survival.time Death Diabetes Cancer SBP DBP
## 1 34.65379 1 0.4771037 0 0 1 121.2353 121.3079
## 2 37.21183 1 3.0704424 0 1 1 122.2000 122.6283
## 3 32.60074 1 0.2607117 1 0 0 118.9136 121.7621
## 4 29.68481 1 NA NA 0 0 118.2212 112.7043
## 5 28.35726 0 0.1681673 1 0 0 116.7469 115.7705
## 6 33.95012 1 1.2238556 0 0 0 119.9936 116.3872
## Education Income
## 1 2 0
## 2 2 0
## 3 0 0
## 4 1 0
## 5 0 0
## 6 1 0
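– Note that the data contain missing values (row 4 above has NA entries); for reference, a quick way to count them per column:
colSums(is.na(dat))  # number of NA values in each column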
– At this point we need the functions "chisq.test()" and "fisher.test()"
TABLE1 = table(dat[,"Diabetes"], dat[,"Education"])
TABLE1
##
## 0 1 2
## 0 403 231 148
## 1 89 54 37
# The command can also be written this way; the result is exactly the same
TABLE2 = table(dat[,c("Diabetes", "Education")])
TABLE2
## Education
## Diabetes 0 1 2
## 0 403 231 148
## 1 89 54 37
Result1 = chisq.test(TABLE1)
Result1
##
## Pearson's Chi-squared test
##
## data: TABLE1
## X-squared = 0.33753, df = 2, p-value = 0.8447
Result2 = fisher.test(TABLE2)
Result2
##
## Fisher's Exact Test for Count Data
##
## data: TABLE2
## p-value = 0.8318
## alternative hypothesis: two.sided
ls(Result1)
## [1] "data.name" "expected" "method" "observed" "parameter" "p.value"
## [7] "residuals" "statistic" "stdres"
ls(Result2)
## [1] "alternative" "data.name" "method" "p.value"
– Written out as code, the expected-count table is computed as follows
TABLE1 = table(dat[,"Diabetes"], dat[,"Education"])
TABLE1
##
## 0 1 2
## 0 403 231 148
## 1 89 54 37
marginal.n1 = rep(NA, nrow(TABLE1))
for (i in 1:nrow(TABLE1)) {
marginal.n1[i] = sum(TABLE1[i,])
}
marginal.n1
## [1] 782 180
marginal.n2 = rep(NA, ncol(TABLE1))
for (i in 1:ncol(TABLE1)) {
marginal.n2[i] = sum(TABLE1[,i])
}
marginal.n2
## [1] 492 285 185
N = sum(TABLE1)
N
## [1] 962
expected.TABLE = marginal.n1%*%t(marginal.n2)/N
expected.TABLE
## [,1] [,2] [,3]
## [1,] 399.94179 231.6736 150.38462
## [2,] 92.05821 53.3264 34.61538
RESULT = expected.TABLE > 5
RESULT
## [,1] [,2] [,3]
## [1,] TRUE TRUE TRUE
## [2,] TRUE TRUE TRUE
sum(RESULT)/length(RESULT)
## [1] 1
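– For reference, the same expected-count table can be obtained without loops via rowSums() and colSums(); a compact sketch that should reproduce the computation above:
outer(rowSums(TABLE1), colSums(TABLE1)) / sum(TABLE1)           # expected counts under independence
mean(outer(rowSums(TABLE1), colSums(TABLE1)) / sum(TABLE1) > 5) # proportion of cells with expected count > 5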
Write a function that takes two categorical variables supplied by the user, first computes the expected counts to decide whether a chi-squared test or Fisher's exact test should be used, and then carries out the chosen test.
In addition, students who work quickly should combine this function with last week's function so that it satisfies the following requirements:
The user can pass in any two variables of equal length to be tested (if the lengths differ, warn and stop the function)
One of the variables must be categorical; the second variable is whatever the user supplies
If the second variable is continuous, first look at the number of categories of the first variable: with 2 categories take the t test / Wilcoxon test route; with more than 2 categories take the ANOVA / Kruskal-Wallis test route
If the second variable is categorical, use the chi-squared test or Fisher's exact test
– Note: the function should decide for the user which test is appropriate and print the report directly
my_func.analysis = function (x, y) {
  # Number of distinct levels in each variable
  lvl.x = levels(factor(x))
  n.lvl.x = length(lvl.x)
  lvl.y = levels(factor(y))
  n.lvl.y = length(lvl.y)
  if (n.lvl.y < 10) {
    # y looks categorical: chi-squared test or Fisher's exact test
    TABLE1 = table(x, y)
    # Row and column marginal totals
    marginal.n1 = rep(NA, nrow(TABLE1))
    for (i in 1:nrow(TABLE1)) {
      marginal.n1[i] = sum(TABLE1[i,])
    }
    marginal.n2 = rep(NA, ncol(TABLE1))
    for (i in 1:ncol(TABLE1)) {
      marginal.n2[i] = sum(TABLE1[,i])
    }
    N = sum(TABLE1)
    # Expected counts under independence
    expected.TABLE = marginal.n1 %*% t(marginal.n2) / N
    RESULT = expected.TABLE > 5
    if (sum(RESULT)/length(RESULT) > 0.8) {
      # More than 80% of expected counts exceed 5: chi-squared test
      result = chisq.test(TABLE1)
    } else {
      result = fisher.test(TABLE1)
    }
  } else {
    # y looks continuous: compare it across the groups defined by x
    n.each = numeric(n.lvl.x)
    for (i in 1:n.lvl.x) {
      current_vec = y[x == lvl.x[i]]
      n.each[i] = sum(table(current_vec))  # non-missing y values in group i
    }
    if (n.lvl.x > 2) {
      if (FALSE %in% (n.each > 25)) {
        # Any group with 25 or fewer observations: non-parametric route
        result = kruskal.test(y ~ x)
      } else {
        Variance.table = aov(y ~ as.factor(x))
        result = anova(Variance.table)
      }
    } else {
      if (FALSE %in% (n.each > 25)) {
        result = wilcox.test(y ~ x)
      } else {
        # Choose the pooled or Welch t-test according to the variance test
        var.result = var.test(y ~ x)
        if (var.result[['p.value']] < 0.05) {
          result = t.test(y ~ x, var.equal = FALSE)
        } else {
          result = t.test(y ~ x, var.equal = TRUE)
        }
      }
    }
  }
  result
}
my_func.analysis(dat$Income, dat$Education)
##
## Pearson's Chi-squared test
##
## data: TABLE1
## X-squared = 8.5353, df = 4, p-value = 0.07383
my_func.analysis(dat[dat$Income == 2 & dat$Diabetes == 1,]$Education, dat[dat$Income == 2 & dat$Diabetes == 1,]$Death)
##
## Fisher's Exact Test for Count Data
##
## data: TABLE1
## p-value = 0.3824
## alternative hypothesis: two.sided
my_func.analysis(dat$Disease, dat$eGFR)
##
## Welch Two Sample t-test
##
## data: y by x
## t = -2.4556, df = 466.06, p-value = 0.01443
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.2043011 -0.2445876
## sample estimates:
## mean in group 0 mean in group 1
## 33.19008 34.41452
my_func.analysis(dat[dat$Income == 2,]$Disease, dat[dat$Income == 2,]$eGFR)
##
## Wilcoxon rank sum test with continuity correction
##
## data: y by x
## W = 691, p-value = 0.2357
## alternative hypothesis: true location shift is not equal to 0
my_func.analysis(dat$Income, dat$eGFR)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(x) 2 102 51.016 1.0115 0.3641
## Residuals 944 47613 50.438
my_func.analysis(dat[dat$Income == 2,]$Education, dat[dat$Income == 2,]$eGFR)
##
## Kruskal-Wallis rank sum test
##
## data: y by x
## Kruskal-Wallis chi-squared = 8.7301, df = 2, p-value = 0.01271
Remember last lesson's homework? In a scientific paper, the first table usually begins by describing how each variable relates to the different 'groups'.
If we are given a continuous variable, we need to present a table like the first one below; a categorical variable gives a table like the second, and doing this for every variable yields the full table:
## Variable Diabetes:0 Diabetes:1 p-value
## 1 "eGFR" "32.9±6.6" "39.8±6.5" "<0.001"
## Variable Diabetes:0 Diabetes:1 p-value
## 1 "Education" "" "" "0.845"
## 2 "Education:0" "403(51.5%)" "89(49.4%)" ""
## 3 "Education:1" "231(29.5%)" "54(30.0%)" ""
## 4 "Education:2" "148(18.9%)" "37(20.6%)" ""
## Variable Diabetes:0 Diabetes:1 p-value
## 1 "Education" "" "" "0.845"
## 2 "Education:0" "403(51.5%)" "89(49.4%)" ""
## 3 "Education:1" "231(29.5%)" "54(30.0%)" ""
## 4 "Education:2" "148(18.9%)" "37(20.6%)" ""
## 5 "eGFR" "32.9±6.6" "39.8±6.5" "<0.001"
## 6 "Income" "" "" "0.673"
## 7 "Income:0" "618(79.2%)" "141(77.5%)" ""
## 8 "Income:1" "77(9.9%)" "22(12.1%)" ""
## 9 "Income:2" "85(10.9%)" "19(10.4%)" ""
## 10 "SBP" "119.7±9.9" "130.3±9.8" "<0.001"
## 11 "DBP" "120.0±9.6" "128.3±9.3" "<0.001"
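– As a rough illustration of how one such row could be put together (a minimal sketch: the helper name table1_row is ours, not part of the lesson, and the formatting is only approximate):
# One continuous-variable row of Table 1: mean±SD per group plus a t-test p-value
table1_row = function(x, group) {
  m = tapply(x, group, mean, na.rm = TRUE)   # group means
  s = tapply(x, group, sd, na.rm = TRUE)     # group standard deviations
  p = t.test(x ~ group)$p.value              # two-group comparison
  c(sprintf("%.1f±%.1f", m, s),
    ifelse(p < 0.001, "<0.001", sprintf("%.3f", p)))
}
table1_row(dat$eGFR, dat$Diabetes)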
– Without resorting to regression-type techniques, correlation coefficients are all we have left
– The function "cor.test()" can be used to compute correlation coefficients
Result3 = cor.test(dat[,"DBP"],dat[,"SBP"], method = "pearson")
Result3
##
## Pearson's product-moment correlation
##
## data: dat[, "DBP"] and dat[, "SBP"]
## t = 53.178, df = 952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8480377 0.8801056
## sample estimates:
## cor
## 0.8649519
Result4 = cor.test(dat[,"DBP"],dat[,"SBP"], method = "spearman")
Result4
##
## Spearman's rank correlation rho
##
## data: dat[, "DBP"] and dat[, "SBP"]
## S = 21264000, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8530528
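– The cor.test() results are also list-like objects, so the coefficient and p-value needed in the exercise below can be extracted with "$":
Result3$estimate  # Pearson correlation coefficient (named "cor")
Result3$p.value   # p-value
Result4$estimate  # Spearman correlation coefficient (named "rho")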
The user can pass in any two continuous variables of equal length to be tested (if the lengths differ, warn and stop the function)
Check the sample size: if it is at least 25 use the Pearson correlation, otherwise use the Spearman correlation
my_func.correlation = function (x, y) {
  if (length(x) != length(y)) {
    # The two variables must have the same length
    stop("x and y must be the same length")
  } else {
    # Sample size = number of complete pairs
    n.sample = sum(!is.na(x) & !is.na(y))
    if (n.sample >= 25) {
      result = cor.test(x, y, method = "pearson")
    } else {
      result = cor.test(x, y, method = "spearman")
    }
    result
  }
}
my_func.correlation(dat[,"DBP"],dat[,"SBP"])
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 53.178, df = 952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8480377 0.8801056
## sample estimates:
## cor
## 0.8649519
my_func.correlation(dat[dat$Income == 2 & dat$Diabetes == 1,"DBP"], dat[dat$Income == 2 & dat$Diabetes == 1,"SBP"])
##
## Spearman's rank correlation rho
##
## data: x and y
## S = 214, p-value = 0.0002096
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.7791538
– For non-continuous variables, the most commonly used correlation coefficient is the polychoric correlation
– We can install the required package in the following way
– that is, use the function "install.packages()" to install the specified package
install.packages("polycor")
library("polycor")
Generally we first look for a function that appears to be the one we need; in this case it is clear that the function "polychor()" is what we want.
Near the bottom of a function's help page there is usually an Examples section; let's look at what its examples do.
– If you cannot tell what a given line of the example does, go through it line by line and work out what format the function's input takes.
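– For reference, the help page and its examples can also be opened straight from the console:
?polychor                                  # open the help page for polychor()
example("polychor", package = "polycor")   # run the examples shown on the help page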
if(require(mvtnorm)){
set.seed(12345)
data <- rmvnorm(1000, c(0, 0), matrix(c(1, .5, .5, 1), 2, 2))
x <- data[,1]
y <- data[,2]
cor(x, y) # sample correlation
}
## [1] 0.5263698
if(require(mvtnorm)){
x <- cut(x, c(-Inf, .75, Inf))
y <- cut(y, c(-Inf, -1, .5, 1.5, Inf))
polychor(x, y) # 2-step estimate
}
## [1] 0.5230474
if(require(mvtnorm)){
set.seed(12345)
polychor(x, y, ML=TRUE, std.err=TRUE) # ML estimate
}
##
## Polychoric Correlation, ML est. = 0.5231 (0.03819)
## Test of bivariate normality: Chisquare = 2.739, df = 2, p = 0.2543
##
## Row Threshold
## Threshold Std.Err.
## 0.7537 0.04403
##
##
## Column Thresholds
## Threshold Std.Err.
## 1 -0.9842 0.04746
## 2 0.4841 0.04127
## 3 1.5010 0.06118
– However, for reasons such as clinical meaning (e.g., a fasting glucose of 126 or above defines diabetes), these two variables may have been turned into categorical/ordinal variables.
– The idea behind the polychoric correlation is therefore to recover, as best we can, the continuous variables underlying the two categorical ones and then test their correlation.
– So in the example code, the first part really does generate two continuous variables and computes a Pearson correlation; the second part forcibly cuts them into several categories and computes the polychoric correlation to see whether the answer is similar
if(require(mvtnorm)){
set.seed(12345)
data <- rmvnorm(1000, c(0, 0), matrix(c(1, .5, .5, 1), 2, 2))
x <- data[,1]
y <- data[,2]
cor(x, y) # sample correlation
}
## [1] 0.5263698
if(require(mvtnorm)){
x <- cut(x, c(-Inf, .75, Inf))
y <- cut(y, c(-Inf, -1, .5, 1.5, Inf))
polychor(x, y) # 2-step estimate
}
## [1] 0.5230474
Result5 = polychor(dat[,"Disease"], dat[,"Diabetes"])
Result5
## [1] 0.1818627
Result6 = polyserial(dat[,"eGFR"], dat[,"Disease"])
Result6
## [1] 0.1042781
Result7 = polychor(dat[,"Disease"], dat[,"Diabetes"], std.err=TRUE)
Result7
##
## Polychoric Correlation, 2-step est. = 0.1819 (0.06496)
Result8 = polyserial(dat[,"eGFR"], dat[,"Disease"], std.err=TRUE)
Result8
##
## Polyserial Correlation, 2-step est. = 0.1063 (0.04478)
## Test of bivariate normality: Chisquare = 8.175, df = 5, p = 0.1468
– In R the function "pchisq()" readily gives cumulative probabilities of the chi-squared distribution, so we can use it to compute the p-value
Z = Result7$rho/sqrt(Result7$var)
CHISQ = Z^2
pchisq(CHISQ, df = 1, lower.tail = FALSE)
## [,1]
## [1,] 0.005112662
pchisq(Result8$chisq, Result8$df, lower.tail = FALSE)
## [1] 0.1468378
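– As a quick check on the reasoning: a chi-squared statistic with 1 degree of freedom is just a squared z statistic, so the same p-value can also be obtained from the standard normal distribution:
2 * pnorm(abs(Z), lower.tail = FALSE)  # two-sided normal p-value; should match the pchisq() result above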
The user can pass in any two variables of equal length to be tested (if the lengths differ, warn and stop the function)
Determine whether each variable supplied by the user is categorical (a factor) or continuous, and apply the appropriate test
If both variables are continuous, first check the sample size: if it is at least 25 use the Pearson correlation, otherwise use the Spearman correlation
Return only three pieces of information: (1) the correlation coefficient, (2) the p-value, (3) which method was used
When computing correlations we are usually given a whole set of variables and asked for a correlation matrix.
Building a Pearson correlation matrix is straightforward; we only need the following command:
cor(dat[,c("eGFR", "SBP", "DBP")], use = "pairwise.complete.obs")
## eGFR SBP DBP
## eGFR 1.0000000 0.9683834 0.9192064
## SBP 0.9683834 1.0000000 0.8649519
## DBP 0.9192064 0.8649519 1.0000000
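– cor() also accepts a method argument, so a rank-based matrix is just as easy; for example:
cor(dat[,c("eGFR", "SBP", "DBP")], use = "pairwise.complete.obs", method = "spearman")  # Spearman version of the same matrix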
– For an extra challenge, fill one triangle of the correlation matrix with the correlation coefficients and the other triangle with the p-values.
library("polycor")
my_func.correlation = function (x, y) {
  if (length(x) != length(y)) {
    stop("x and y must be the same length")
  } else {
    # Number of distinct levels, used to decide categorical vs. continuous
    lvl.x = levels(factor(x))
    n.lvl.x = length(lvl.x)
    lvl.y = levels(factor(y))
    n.lvl.y = length(lvl.y)
    if (n.lvl.x > 10 & n.lvl.y > 10) {
      # Both variables continuous: Pearson or Spearman correlation
      n.sample = sum(!is.na(x) & !is.na(y))
      if (n.sample >= 25) {
        result = cor.test(x, y, method = "pearson")
        method = "pearson"
      } else {
        result = cor.test(x, y, method = "spearman")
        method = "spearman"
      }
      rho = result$estimate
      p_val = result$p.value
    } else {
      if (n.lvl.x < 10) {
        if (n.lvl.y < 10) {
          # Both categorical: polychoric correlation with a Wald chi-squared p-value
          result = polychor(x, y, std.err = TRUE)
          Z = result$rho/sqrt(result$var)
          CHISQ = Z^2
          p_val = pchisq(CHISQ, df = 1, lower.tail = FALSE)
          method = "polychor"
        } else {
          # x categorical, y continuous: polyserial(continuous, ordinal)
          result = polyserial(y, x, std.err = TRUE)
          p_val = pchisq(result$chisq, result$df, lower.tail = FALSE)
          method = "polyserial"
        }
      } else {
        # x continuous, y categorical
        result = polyserial(x, y, std.err = TRUE)
        p_val = pchisq(result$chisq, result$df, lower.tail = FALSE)
        method = "polyserial"
      }
      rho = result$rho
    }
    my_list = list(rho, p_val, method)
    names(my_list) = c('rho', 'p_val', 'method')
    my_list
  }
}
my_func.correlation(dat[,"DBP"],dat[,"SBP"])
## $rho
## cor
## 0.8649519
##
## $p_val
## [1] 2.665107e-287
##
## $method
## [1] "pearson"
my_func.correlation(dat[dat$Income == 2 & dat$Diabetes == 1,"DBP"], dat[dat$Income == 2 & dat$Diabetes == 1,"SBP"])
## $rho
## rho
## 0.7791538
##
## $p_val
## [1] 0.0002095772
##
## $method
## [1] "spearman"
my_func.correlation(dat[,"Income"],dat[,"SBP"])
## $rho
## [1] 0.001014369
##
## $p_val
## [1] 0.8523447
##
## $method
## [1] "polyserial"
my_func.correlation(dat[,"Income"],dat[,"Disease"])
## $rho
## [1] 0.008209949
##
## $p_val
## [,1]
## [1,] 0.8935464
##
## $method
## [1] "polychor"
my_func.correlation(dat[,"eGFR"],dat[,"Disease"])
## $rho
## [1] 0.1063003
##
## $p_val
## [1] 0.1468378
##
## $method
## [1] "polyserial"
my_func.correlation_matrix = function (data) {
  # Start from a matrix of 1s so the diagonal is already correct
  correlation_matrix = matrix(1, nrow = ncol(data), ncol = ncol(data))
  rownames(correlation_matrix) = colnames(data)
  colnames(correlation_matrix) = colnames(data)
  for (i in 1:ncol(data)) {
    for (j in 1:ncol(data)) {
      if (j < i) {
        result_list = my_func.correlation(data[,i], data[,j])
        # Upper triangle: correlation coefficient; lower triangle: p-value
        correlation_matrix[j,i] = result_list$rho
        correlation_matrix[i,j] = result_list$p_val
      }
    }
  }
  correlation_matrix
}
my_func.correlation_matrix(data = dat)
## eGFR Disease Survival.time Death
## eGFR 1.0000000 1.063003e-01 2.217909e-02 -0.077650458
## Disease 0.1468378 1.000000e+00 5.550212e-02 -0.001989727
## Survival.time 0.5046980 1.961238e-208 1.000000e+00 0.043674513
## Death 0.2418072 9.720354e-01 6.345820e-221 1.000000000
## Diabetes 0.1589850 5.112662e-03 6.547588e-202 0.786730236
## Cancer 0.1153516 9.857535e-01 3.406996e-215 0.585043216
## SBP 0.0000000 7.060265e-01 4.046682e-01 0.234451469
## DBP 0.0000000 3.396927e-02 6.293485e-01 0.199337449
## Education 0.3037279 1.264179e-01 6.793488e-220 0.315127602
## Income 0.4530232 8.935464e-01 4.130601e-215 0.750710303
## Diabetes Cancer SBP DBP
## eGFR 0.55061563 0.036718500 9.683834e-01 0.91920635
## Disease 0.18187368 0.001003631 1.147569e-01 0.07783083
## Survival.time 0.02964107 -0.006368456 2.759606e-02 -0.01588972
## Death 0.01621756 -0.028495907 -7.325610e-02 -0.06317791
## Diabetes 1.00000000 0.082807411 5.587485e-01 0.47481803
## Cancer 0.16045071 1.000000000 4.945587e-02 0.03467301
## SBP 0.63326768 0.528875761 1.000000e+00 0.86495187
## DBP 0.68352697 0.402321710 2.665107e-287 1.00000000
## Education 0.56456435 0.355532672 7.483356e-01 0.76053101
## Income 0.72928037 0.444110047 8.523447e-01 0.35923294
## Education Income
## eGFR 0.06375443 0.051815168
## Disease 0.07781775 0.008209949
## Survival.time 0.51042244 0.026751086
## Death 0.04756400 0.017904954
## Diabetes 0.03098767 0.022109389
## Cancer 0.04330285 0.042625790
## SBP 0.05110441 0.001014369
## DBP 0.02460452 -0.020832642
## Education 1.00000000 -0.021262920
## Income 0.67511363 1.000000000
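– For a cleaner printout, the mixed coefficient/p-value matrix can be rounded to a few significant digits, e.g.:
signif(my_func.correlation_matrix(data = dat), 3)  # 3 significant digits for display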
The most important things learned in this lesson are how to install packages and how to read package documentation.
Combined with the earlier lessons, we can now compose functions to produce the results we want.
This concludes the univariate-analysis part of the course; starting next week we will introduce generalized linear models and the Cox proportional hazards model
– The Cox proportional hazards model requires an additional package. Please find out which package it is, study its examples carefully, and work out how to use it; next class we will begin with your reports.
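– For reference, one widely used choice is the survival package; a minimal sketch, assuming the Survival.time and Death columns of the example data are the follow-up time and the event indicator:
library(survival)
coxph(Surv(Survival.time, Death) ~ eGFR, data = dat)  # Cox proportional hazards model with eGFR as the covariate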